Introduction: From two data sets called Beer and Breweries provided by Budweiser, we can find important information for Budweiser. The facts we focus on are alcohol, Beer size, Alcohol by volume of the beer (ABV), and International Bitterness Units of the beer (IBU). This data set consists of information about 2410 US craft beers from 558 US breweries. This data analysis aims to provide descriptive information about the current craft beer market in the United States.
Lets Load some libraries

library(tidyverse)
library(gridExtra)
library(class)
library(usmap)
library(ggplot2)
library(foreign)
library(haven)
library(ggplot2)
library(foreign)
library(ggplot2)
library(GGally)
library(haven)
library(magrittr)
library(data.table)
library(dplyr)
library(plyr)
library(dplyr)
library(factoextra)
library(ggplot2)
library(ggmap)
library(nycflights13)
library(tidyverse)
library(datasets)
library(readxl)
library(tidyverse) 
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(plyr)
library(gridExtra)
library(factoextra)
library(GGally)
library(readxl)
library(tidyverse) 
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(gridExtra)
library(factoextra)
library(GGally)
library(gridExtra)
library(graphics)
library(mice)
library(PerformanceAnalytics)
require(PerformanceAnalytics)
library(MASS)
library(reshape)

Lets load the data

Beers4=read.csv("/Users/owner/Desktop/homework/unit8/Beers 4.csv")

Breweries = read.csv("/Users/owner/Desktop/homework/unit8/Breweries.csv")

Lets look at the varibles

colnames(Breweries)
## [1] "Brew_ID" "Name"    "City"    "State"
colnames(Beers4)
## [1] "Name"       "Beer_ID"    "ABV"        "IBU"        "Brewery_id"
## [6] "Style"      "Ounces"
State=as.factor(Breweries$State)
levels(State)
##  [1] " AK" " AL" " AR" " AZ" " CA" " CO" " CT" " DC" " DE" " FL" " GA" " HI"
## [13] " IA" " ID" " IL" " IN" " KS" " KY" " LA" " MA" " MD" " ME" " MI" " MN"
## [25] " MO" " MS" " MT" " NC" " ND" " NE" " NH" " NJ" " NM" " NV" " NY" " OH"
## [37] " OK" " OR" " PA" " RI" " SC" " SD" " TN" " TX" " UT" " VA" " VT" " WA"
## [49] " WI" " WV" " WY"

Colorado has the highest number (47) of breweries, and California has 39 breweries, which is second. In Colorado, most brewery are in Boulder and Denver. Also, California most breweries are in San Diego.

  1. How many breweries are present in each state?
df1 = data.frame(table(Breweries$State))
colnames(df1) = c("State","Breweries")
df1 = df1[order(-df1$Breweries),]
row.names(df1) = NULL
df1
##    State Breweries
## 1     CO        47
## 2     CA        39
## 3     MI        32
## 4     OR        29
## 5     TX        28
## 6     PA        25
## 7     MA        23
## 8     WA        23
## 9     IN        22
## 10    WI        20
## 11    NC        19
## 12    IL        18
## 13    NY        16
## 14    VA        16
## 15    FL        15
## 16    OH        15
## 17    MN        12
## 18    AZ        11
## 19    VT        10
## 20    ME         9
## 21    MO         9
## 22    MT         9
## 23    CT         8
## 24    AK         7
## 25    GA         7
## 26    MD         7
## 27    OK         6
## 28    IA         5
## 29    ID         5
## 30    LA         5
## 31    NE         5
## 32    RI         5
## 33    HI         4
## 34    KY         4
## 35    NM         4
## 36    SC         4
## 37    UT         4
## 38    WY         4
## 39    AL         3
## 40    KS         3
## 41    NH         3
## 42    NJ         3
## 43    TN         3
## 44    AR         2
## 45    DE         2
## 46    MS         2
## 47    NV         2
## 48    DC         1
## 49    ND         1
## 50    SD         1
## 51    WV         1
df1$state = trimws(as.character(df1$State))
p = plot_usmap(data = df1, values = "Breweries",labels = TRUE, color = "red") + 
  scale_fill_continuous(low = "white", high = "red", 
                        name = "Breweries", label = scales::comma) + 
  theme(legend.position = "right")

p$layers[[2]]$aes_params$size <- 2.5
p

US map also shows the highest number of breweries, and California is the second.

  1. Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.
merged = merge(x = Beers4, y = Breweries, by.x = "Brewery_id", by.y = "Brew_ID", all.x = TRUE)
Beer = merged$Name.x
Brewery=merged$Name.y
head(merged,6)
##   Brewery_id        Name.x Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces             Name.y        City
## 1                        American IPA     16 NorthGate Brewing  Minneapolis
## 2                  Milk / Sweet Stout     16 NorthGate Brewing  Minneapolis
## 3                   English Brown Ale     16 NorthGate Brewing  Minneapolis
## 4                         Pumpkin Ale     16 NorthGate Brewing  Minneapolis
## 5                     American Porter     16 NorthGate Brewing  Minneapolis
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing  Minneapolis
##   State
## 1    MN
## 2    MN
## 3    MN
## 4    MN
## 5    MN
## 6    MN
tail(merged,6)
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                        Name.y          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK

#visualize the missing data

  1. Address the missing values in each column.
sum(is.na(merged))
## [1] 1067
which(is.na(merged)) 
##    [1]  7305  7306  7416  7423  7457  7482  7670  7671  7738  7739  7740  7741
##   [13]  7798  7799  7800  7801  7802  7803  7804  7859  8019  8021  8023  8130
##   [25]  8222  8223  8224  8226  8388  8528  8654  8710  8861  9117  9122  9149
##   [37]  9154  9181  9326  9407  9408  9414  9457  9464  9472  9489  9490  9491
##   [49]  9492  9526  9575  9581  9582  9583  9584  9585  9586  9587  9595  9600
##   [61]  9624  9625  9657  9675  9676  9677  9678  9679  9680  9681  9682  9683
##   [73]  9684  9685  9686  9688  9689  9691  9692  9694  9695  9696  9697  9698
##   [85]  9699  9700  9701  9702  9703  9704  9705  9706  9707  9708  9709  9710
##   [97]  9711  9712  9713  9714  9715  9716  9717  9718  9719  9720  9721  9722
##  [109]  9723  9724  9725  9726  9727  9728  9729  9730  9731  9732  9733  9734
##  [121]  9735  9736  9737  9738  9739  9740  9741  9742  9743  9744  9745  9746
##  [133]  9747  9748  9749  9750  9751  9752  9753  9754  9755  9756  9757  9764
##  [145]  9765  9767  9770  9775  9779  9785  9792  9798  9799  9800  9803  9804
##  [157]  9805  9808  9809  9812  9815  9826  9833  9834  9838  9843  9854  9855
##  [169]  9856  9858  9860  9867  9875  9877  9878  9884  9892  9893  9894  9895
##  [181]  9896  9897  9899  9900  9903  9907  9909  9910  9911  9912  9913  9914
##  [193]  9917  9918  9927  9931  9932  9933  9934  9936  9937  9938  9939  9940
##  [205]  9941  9942  9943  9944  9945  9946  9947  9948  9949  9951  9968  9969
##  [217]  9978 10000 10001 10002 10003 10006 10007 10010 10011 10012 10013 10015
##  [229] 10016 10019 10020 10024 10025 10027 10030 10031 10032 10033 10035 10037
##  [241] 10039 10044 10050 10051 10052 10053 10056 10057 10058 10059 10060 10062
##  [253] 10063 10079 10080 10081 10083 10085 10086 10087 10088 10089 10090 10091
##  [265] 10093 10094 10097 10099 10104 10105 10108 10117 10118 10121 10125 10132
##  [277] 10133 10135 10136 10142 10143 10144 10145 10146 10147 10148 10149 10150
##  [289] 10151 10155 10178 10179 10180 10181 10186 10187 10188 10189 10197 10199
##  [301] 10201 10202 10208 10209 10210 10211 10212 10213 10214 10215 10216 10225
##  [313] 10226 10227 10229 10236 10237 10238 10239 10240 10251 10252 10253 10254
##  [325] 10255 10256 10261 10266 10267 10268 10269 10274 10275 10278 10284 10294
##  [337] 10295 10296 10297 10299 10301 10302 10303 10304 10305 10306 10322 10323
##  [349] 10336 10344 10353 10355 10358 10363 10365 10369 10372 10385 10388 10389
##  [361] 10391 10398 10402 10406 10416 10421 10429 10430 10431 10433 10435 10445
##  [373] 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457
##  [385] 10465 10466 10468 10469 10470 10471 10478 10479 10480 10481 10484 10488
##  [397] 10489 10493 10496 10501 10502 10503 10515 10516 10517 10518 10519 10520
##  [409] 10521 10522 10523 10527 10528 10529 10530 10531 10532 10540 10541 10551
##  [421] 10552 10553 10554 10556 10572 10574 10575 10587 10591 10595 10596 10597
##  [433] 10598 10599 10600 10603 10604 10606 10607 10630 10631 10632 10633 10634
##  [445] 10635 10636 10640 10641 10643 10650 10651 10653 10662 10663 10664 10668
##  [457] 10669 10672 10673 10674 10675 10690 10691 10694 10695 10696 10697 10698
##  [469] 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710
##  [481] 10711 10712 10713 10714 10715 10716 10717 10719 10720 10721 10722 10723
##  [493] 10724 10725 10726 10727 10728 10729 10730 10731 10733 10734 10736 10737
##  [505] 10743 10758 10759 10760 10761 10762 10763 10768 10773 10780 10781 10794
##  [517] 10795 10796 10797 10798 10799 10800 10801 10803 10819 10820 10821 10822
##  [529] 10824 10825 10827 10828 10831 10838 10839 10840 10841 10842 10845 10846
##  [541] 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10860
##  [553] 10870 10871 10872 10873 10875 10877 10878 10879 10880 10881 10882 10883
##  [565] 10894 10895 10905 10908 10926 10929 10932 10933 10937 10938 10939 10940
##  [577] 10941 10942 10943 10950 10957 10967 10968 10969 10970 10971 10972 10979
##  [589] 10991 11002 11010 11013 11014 11017 11019 11021 11022 11023 11026 11027
##  [601] 11028 11029 11030 11031 11032 11034 11036 11038 11039 11040 11041 11044
##  [613] 11051 11053 11054 11055 11056 11057 11058 11059 11062 11063 11064 11066
##  [625] 11067 11071 11073 11083 11086 11087 11088 11089 11090 11091 11096 11108
##  [637] 11109 11110 11111 11117 11118 11119 11120 11127 11132 11141 11143 11146
##  [649] 11148 11149 11150 11151 11155 11163 11165 11182 11184 11185 11186 11187
##  [661] 11188 11190 11191 11192 11193 11196 11197 11203 11205 11206 11208 11209
##  [673] 11210 11211 11212 11213 11215 11216 11217 11218 11220 11225 11229 11232
##  [685] 11236 11237 11240 11241 11242 11243 11244 11245 11247 11250 11253 11254
##  [697] 11260 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277
##  [709] 11280 11290 11291 11294 11295 11301 11302 11303 11304 11305 11306 11307
##  [721] 11308 11314 11315 11316 11317 11318 11319 11320 11323 11324 11325 11337
##  [733] 11346 11347 11348 11351 11352 11367 11368 11369 11375 11377 11378 11379
##  [745] 11380 11381 11382 11383 11385 11386 11388 11389 11394 11395 11401 11405
##  [757] 11406 11412 11419 11420 11421 11422 11423 11424 11425 11428 11429 11432
##  [769] 11444 11445 11446 11452 11453 11456 11472 11474 11475 11476 11477 11479
##  [781] 11480 11481 11484 11494 11495 11501 11502 11503 11504 11514 11516 11517
##  [793] 11518 11519 11522 11526 11527 11528 11529 11530 11531 11532 11533 11534
##  [805] 11535 11540 11549 11554 11555 11556 11557 11558 11559 11560 11561 11562
##  [817] 11563 11564 11565 11566 11569 11570 11571 11572 11579 11580 11581 11582
##  [829] 11583 11590 11591 11592 11593 11594 11599 11600 11601 11602 11603 11606
##  [841] 11613 11614 11615 11616 11617 11618 11619 11620 11623 11624 11625 11631
##  [853] 11632 11633 11634 11635 11636 11637 11639 11640 11641 11642 11644 11646
##  [865] 11647 11648 11649 11650 11654 11655 11656 11657 11658 11662 11663 11672
##  [877] 11673 11674 11675 11676 11680 11681 11682 11683 11685 11686 11688 11689
##  [889] 11690 11693 11695 11703 11704 11705 11706 11711 11712 11713 11714 11715
##  [901] 11716 11717 11718 11722 11724 11725 11726 11727 11728 11729 11734 11735
##  [913] 11736 11742 11745 11746 11752 11753 11754 11755 11765 11772 11774 11775
##  [925] 11776 11777 11783 11784 11785 11786 11787 11792 11794 11795 11796 11797
##  [937] 11798 11799 11800 11808 11809 11810 11811 11812 11815 11817 11818 11819
##  [949] 11821 11822 11823 11824 11825 11826 11827 11836 11837 11838 11841 11844
##  [961] 11845 11846 11854 11855 11856 11857 11867 11868 11874 11875 11879 11882
##  [973] 11885 11888 11892 11893 11894 11899 11900 11901 11902 11905 11906 11907
##  [985] 11909 11910 11911 11912 11924 11926 11927 11928 11930 11931 11932 11933
##  [997] 11934 11935 11936 11945 11946 11947 11948 11949 11950 11951 11956 11957
## [1009] 11963 11964 11965 11966 11967 11970 11971 11972 11973 11982 11983 11984
## [1021] 11985 11987 11991 11992 11993 11994 11995 11996 11997 12001 12002 12003
## [1033] 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015
## [1045] 12018 12019 12022 12023 12032 12033 12034 12035 12036 12037 12038 12039
## [1057] 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050
plot_missing(merged)

p=function(x) {sum(is.na(x))/length(x)*100}
apply(merged,2,p)  
## Brewery_id     Name.x    Beer_ID        ABV        IBU      Style     Ounces 
##   0.000000   0.000000   0.000000   2.572614  41.701245   0.000000   0.000000 
##     Name.y       City      State 
##   0.000000   0.000000   0.000000
md.pattern(merged)

##      Brewery_id Name.x Beer_ID Style Ounces Name.y City State ABV  IBU     
## 1405          1      1       1     1      1      1    1     1   1    1    0
## 943           1      1       1     1      1      1    1     1   1    0    1
## 62            1      1       1     1      1      1    1     1   0    0    2
##               0      0       0     0      0      0    0     0  62 1005 1067
md.pairs(merged)
## $rr
##            Brewery_id Name.x Beer_ID  ABV  IBU Style Ounces Name.y City State
## Brewery_id       2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## Name.x           2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## Beer_ID          2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## ABV              2348   2348    2348 2348 1405  2348   2348   2348 2348  2348
## IBU              1405   1405    1405 1405 1405  1405   1405   1405 1405  1405
## Style            2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## Ounces           2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## Name.y           2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## City             2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## State            2410   2410    2410 2348 1405  2410   2410   2410 2410  2410
## 
## $rm
##            Brewery_id Name.x Beer_ID ABV  IBU Style Ounces Name.y City State
## Brewery_id          0      0       0  62 1005     0      0      0    0     0
## Name.x              0      0       0  62 1005     0      0      0    0     0
## Beer_ID             0      0       0  62 1005     0      0      0    0     0
## ABV                 0      0       0   0  943     0      0      0    0     0
## IBU                 0      0       0   0    0     0      0      0    0     0
## Style               0      0       0  62 1005     0      0      0    0     0
## Ounces              0      0       0  62 1005     0      0      0    0     0
## Name.y              0      0       0  62 1005     0      0      0    0     0
## City                0      0       0  62 1005     0      0      0    0     0
## State               0      0       0  62 1005     0      0      0    0     0
## 
## $mr
##            Brewery_id Name.x Beer_ID ABV IBU Style Ounces Name.y City State
## Brewery_id          0      0       0   0   0     0      0      0    0     0
## Name.x              0      0       0   0   0     0      0      0    0     0
## Beer_ID             0      0       0   0   0     0      0      0    0     0
## ABV                62     62      62   0   0    62     62     62   62    62
## IBU              1005   1005    1005 943   0  1005   1005   1005 1005  1005
## Style               0      0       0   0   0     0      0      0    0     0
## Ounces              0      0       0   0   0     0      0      0    0     0
## Name.y              0      0       0   0   0     0      0      0    0     0
## City                0      0       0   0   0     0      0      0    0     0
## State               0      0       0   0   0     0      0      0    0     0
## 
## $mm
##            Brewery_id Name.x Beer_ID ABV  IBU Style Ounces Name.y City State
## Brewery_id          0      0       0   0    0     0      0      0    0     0
## Name.x              0      0       0   0    0     0      0      0    0     0
## Beer_ID             0      0       0   0    0     0      0      0    0     0
## ABV                 0      0       0  62   62     0      0      0    0     0
## IBU                 0      0       0  62 1005     0      0      0    0     0
## Style               0      0       0   0    0     0      0      0    0     0
## Ounces              0      0       0   0    0     0      0      0    0     0
## Name.y              0      0       0   0    0     0      0      0    0     0
## City                0      0       0   0    0     0      0      0    0     0
## State               0      0       0   0    0     0      0      0    0     0

ABV and IBU columns contain missing values. The ABV column contains 62 missing values, and the IBU column contains 1005 missing values. Since ABV contains fewer missing values, we can use medians at the state level to replace those values. IBU contains many missing values. So, we use a predictive regression model to predict the IBU value based on ABV.

Now, we will replace the missing data with the median

for(i in 1:ncol(merged))
{
  if(is.numeric(merged[,i]))
  {
    merged[is.na(merged[,i]), i] <- median(merged[,i], na.rm = TRUE)
  }
}

Missing map after replacing of the data

plot_missing(merged)

  1. Compute the median alcohol content and international bitterness unit for each state.
ABV.med = aggregate(ABV ~ State, data = merged, FUN = median)
IBU.med = aggregate(IBU ~ State, data = merged, FUN = median)
ABV.med
##    State    ABV
## 1     AK 0.0560
## 2     AL 0.0600
## 3     AR 0.0520
## 4     AZ 0.0560
## 5     CA 0.0580
## 6     CO 0.0590
## 7     CT 0.0600
## 8     DC 0.0625
## 9     DE 0.0555
## 10    FL 0.0560
## 11    GA 0.0550
## 12    HI 0.0540
## 13    IA 0.0555
## 14    ID 0.0565
## 15    IL 0.0580
## 16    IN 0.0580
## 17    KS 0.0500
## 18    KY 0.0600
## 19    LA 0.0520
## 20    MA 0.0540
## 21    MD 0.0580
## 22    ME 0.0510
## 23    MI 0.0600
## 24    MN 0.0560
## 25    MO 0.0550
## 26    MS 0.0580
## 27    MT 0.0550
## 28    NC 0.0570
## 29    ND 0.0500
## 30    NE 0.0560
## 31    NH 0.0550
## 32    NJ 0.0460
## 33    NM 0.0610
## 34    NV 0.0580
## 35    NY 0.0550
## 36    OH 0.0580
## 37    OK 0.0600
## 38    OR 0.0560
## 39    PA 0.0560
## 40    RI 0.0550
## 41    SC 0.0550
## 42    SD 0.0600
## 43    TN 0.0570
## 44    TX 0.0560
## 45    UT 0.0400
## 46    VA 0.0565
## 47    VT 0.0550
## 48    WA 0.0555
## 49    WI 0.0520
## 50    WV 0.0620
## 51    WY 0.0500
IBU.med
##    State  IBU
## 1     AK 35.0
## 2     AL 39.5
## 3     AR 35.0
## 4     AZ 35.0
## 5     CA 35.0
## 6     CO 35.0
## 7     CT 35.0
## 8     DC 35.0
## 9     DE 43.5
## 10    FL 35.0
## 11    GA 35.0
## 12    HI 35.0
## 13    IA 29.5
## 14    ID 35.0
## 15    IL 35.0
## 16    IN 35.0
## 17    KS 22.0
## 18    KY 35.0
## 19    LA 35.0
## 20    MA 35.0
## 21    MD 35.0
## 22    ME 35.0
## 23    MI 35.0
## 24    MN 38.0
## 25    MO 32.5
## 26    MS 45.0
## 27    MT 35.0
## 28    NC 35.0
## 29    ND 32.0
## 30    NE 35.0
## 31    NH 35.0
## 32    NJ 34.5
## 33    NM 35.0
## 34    NV 35.0
## 35    NY 35.0
## 36    OH 35.0
## 37    OK 35.0
## 38    OR 35.0
## 39    PA 35.0
## 40    RI 32.0
## 41    SC 35.0
## 42    SD 35.0
## 43    TN 36.0
## 44    TX 35.0
## 45    UT 35.0
## 46    VA 37.5
## 47    VT 35.0
## 48    WA 35.0
## 49    WI 35.0
## 50    WV 57.5
## 51    WY 32.0

We are going to show ABV and IBU in different states in bar plots:

P.ABV = ggplot(ABV.med, aes(x = ABV, y = State)) + 
  geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
  labs(title = "Median ABV") + 
  theme_bw() + 
  theme(text = element_text(size = 8.1))

P.IBU = ggplot(IBU.med, aes(x = IBU, y = State)) + 
  geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
  labs(title = "Median IBU") + 
  theme_bw()+
  theme(text = element_text(size = 8.1))

grid.arrange(P.ABV, P.IBU,ncol = 2)

  1. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
df2=subset(merged,select=c("State","ABV","IBU"))
maxABV=subset(df2,ABV==max(ABV,na.rm = TRUE))
maxABV
##     State   ABV IBU
## 375    CO 0.128  35
maxIBU=subset(df2,IBU==max(IBU,na.rm = TRUE))
maxIBU
##      State   ABV IBU
## 1857    OR 0.082 138
ABV.max = aggregate(ABV ~ State, data = merged, FUN = max)
ABV.max[order(-ABV.max$ABV),][1,]
##   State   ABV
## 6    CO 0.128
IBU.max = aggregate(IBU ~ State, data = merged, FUN = max)
IBU.max[order(-IBU.max$IBU),][1,]
##    State IBU
## 38    OR 138
abv = ABV.max[order(-ABV.max$ABV),][1:5,]
ggplot(abv, aes(x = ABV*100, y = reorder(State,ABV))) + 
  geom_bar(stat = "identity", width = 0.5, color = "orange", fill = "gold") +
  labs(title = "Top 5 States with maximum ABV values", y = "State", x = "ABV percentage") + 
  theme_bw()+
  geom_text(aes(label = paste0(ABV*100,"%"))) +
  theme(text = element_text(size = 10))

ibu = IBU.max[order(-IBU.max$IBU),][1:5,]
ggplot(ibu, aes(x = IBU, y = reorder(State,IBU))) + 
  geom_bar(stat = "identity", width = 0.5, color = "orange", fill = "gold") +
  labs(title = "Top 5 States with maximum IBU values", y ="State") + 
  theme_bw()+
  geom_text(aes(label = IBU)) +
  theme(text = element_text(size = 10))

Colorado has the Maximum ABV at 0.128. Oregon has the Maximum IBU at 138.

  1. Comment on the summary statistics and distribution of the ABV variable.
summary(merged$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00100 0.05000 0.05600 0.05968 0.06700 0.12800
ggplot(merged, aes(x = ABV)) + 
  geom_histogram(color="blue", fill="skyblue") +
  theme_bw() +
  ggtitle("Histogram for ABV")

Null hypothesis: the ABV is normally distributed?

shapiro.test(merged$ABV)
## 
##  Shapiro-Wilk normality test
## 
## data:  merged$ABV
## W = 0.93421, p-value < 2.2e-16

ABV had not a normal distribution

  1. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot.
ggplot(df2, aes(x = ABV, y = IBU)) + 
  geom_point(color = "blue")+
  geom_smooth(method = "lm",se = F, color = "red")+
  ggtitle("Scatterplot for ABV vs IBU") + 
  theme_bw()

ggplot(df2, aes(ABV,IBU,color="blue")) + 
  geom_point(color="blue")+
  geom_smooth()

chart.Correlation(df2[,-1],histogram=TRUE,pch=100)

ABV and IBU was correlated to each other

Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). We decide to use KNN classification to investigate this relationship. We will provide statistical evidence one way or the other.

minDF = subset(merged, select = c("ABV","IBU","Style","State"),
               is.na(ABV) == FALSE & is.na(IBU) == FALSE)

minDF$S1 = NA

for(k in 1:nrow(minDF))
{
  if(grepl("IPA", minDF$Style[k], fixed = TRUE) == TRUE)
  {
    minDF$S1[k] = "IPA"
  }
  else if(grepl("Ale", minDF$Style[k], fixed = TRUE) == TRUE)
  {
    minDF$S1[k] = "Ale"
  }
}
clsdf = subset(minDF, select = c("ABV","IBU","S1","State"), is.na(S1) == FALSE)
clsdf = dplyr::rename(clsdf,Style = S1)

The relationship of ABV and IBU in terms of Ale and IPA

ggplot(clsdf,aes(ABV,IBU,color=Style))+
  geom_point(shape=4,size=2)+
  geom_smooth(method=loess,se=F)

Box plot of ABV in Ale and IPA

ggplot(clsdf,aes(Style,ABV,fill=Style))+geom_boxplot()

Box plot of IBU in Ale and IPA

ggplot(clsdf,aes(Style,IBU,fill=Style))+geom_boxplot()

The relationship between IBU and ABV in terms of Style(Ale,IPA)

ggplot(clsdf,aes(ABV,IBU,color=Style))+
  geom_point(shape=4,size=2)+
geom_smooth(method=lm,se=F)

to Compare Two Variances

var.test(IBU~Style,data=clsdf)
## 
##  F test to compare two variances
## 
## data:  IBU by Style
## F = 0.33279, num df = 962, denom df = 570, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2870047 0.3848152
## sample estimates:
## ratio of variances 
##          0.3327944

Two variance were different

ale=subset(clsdf,Style=="Ale")
ipa=subset(clsdf,Style=="IPA")

To test normality of the variables

shapiro.test(ale$IBU)
## 
##  Shapiro-Wilk normality test
## 
## data:  ale$IBU
## W = 0.85296, p-value < 2.2e-16
shapiro.test(ipa$IBU)
## 
##  Shapiro-Wilk normality test
## 
## data:  ipa$IBU
## W = 0.89739, p-value < 2.2e-16
shapiro.test(ale$ABV)
## 
##  Shapiro-Wilk normality test
## 
## data:  ale$ABV
## W = 0.90472, p-value < 2.2e-16
shapiro.test(ipa$ABV)
## 
##  Shapiro-Wilk normality test
## 
## data:  ipa$ABV
## W = 0.97267, p-value = 7.813e-09

The variables had not normal distribution

To plot the normality of the variables

qqnorm(ale$IBU)

qqnorm(ale$ABV)

qqnorm(ipa$IBU)

qqnorm(ipa$ABV)

To test the different between ABV in diffrenet styles(IPA and Ale), we should use Mann withney test

wilcox.test(clsdf$ABV ~ clsdf$Style)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  clsdf$ABV by clsdf$Style
## W = 116772, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

ABV and Style had a significant difference

wilcox.test(clsdf$IBU ~ clsdf$Style)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  clsdf$IBU by clsdf$Style
## W = 97264, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

IBU and Style had a significant difference

Using KNN:

clsdf = subset(clsdf, select = c("ABV","IBU","Style"))
idx = sample.int(n = nrow(clsdf), size = floor(0.7*nrow(clsdf)), replace = F)
train = clsdf[idx,]
test = clsdf[-idx,]
trn_target = train$Style
trn = train[,-3]
tst_target = test$Style
tst = test[,-3]

pred = knn(train = trn, test = tst, cl = trn_target, k = 6)
head(pred)
## [1] Ale Ale IPA Ale IPA IPA
## Levels: Ale IPA

To show confusion matrix:

model_table=table(tst_target,pred)
model_table
##           pred
## tst_target Ale IPA
##        Ale 234  38
##        IPA  52 137

To test the accuracy:

sum(diag(model_table))/nrow(tst)
## [1] 0.8047722
Accuracy = NULL
mis = NULL
sen = NULL
spe = NULL

for(i in 1:50)
{
  pred = knn(train = trn, test = tst, cl = trn_target, k = i)
  head(pred)
  model_table=table(trn_target)
  tab = table(Predicted = pred, Real = tst_target)
  Accuracy[i] = ((tab[1,1] + tab[2,2])/sum(tab))*100
  mis[i] = round((tab[1,2]+tab[2,1])/sum(tab),2)
  sen[i] = round(tab[2,2]/(tab[2,2]+tab[1,2]),2)
  spe[i] = round(tab[1,1]/(tab[1,1]+tab[2,1]),2)
}
plot(x = c(1:50), y = Accuracy, xlab = "k", pch = 19, type = "b")
abline(v = which.max(Accuracy), col = "red", lwd = 2)

data.frame(Measure = c("Accuracy","Misclassification Rate","Sensitivity","Specificity"),
           Value = c(round(Accuracy[6],2),round(mis[6],2),round(sen[6],2),round(spe[6],2)))
##                  Measure Value
## 1               Accuracy 80.69
## 2 Misclassification Rate  0.19
## 3            Sensitivity  0.71
## 4            Specificity  0.87

9.Find one other useful inference from the data that you feel Budweiser may be able to find value in.

oz = aggregate(Ounces ~ State, data = merged, FUN = sum)
ggplot(oz, aes(x = Ounces, y = reorder(State,Ounces))) + 
  geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
  labs(title = "Median Ounces per State", y = " Ounces") + 
  theme_bw() + 
  theme(text = element_text(size = 8.1))

co = subset(merged, State == " CO")
oz_co = aggregate(Ounces ~ City, data = co, FUN = sum)

ggplot(oz_co, aes(x = Ounces, y = reorder(City,Ounces))) + 
  geom_bar(stat = "identity", width = 0.5, color = "blue", fill = "skyblue") +
  labs(title = "Number of Ounces for Cities in Colorado", y = " Ounces")  + 
  theme_bw() +
  theme(text = element_text(size = 8.1))

To evaluate the relationship between Ounces in different cities of Colorado

Anova_result= aov(Ounces ~ City, data = co)
summary(Anova_result)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## City         26  415.3  15.972   3.318 5.61e-07 ***
## Residuals   238 1145.5   4.813                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
minDF1 = subset(co, select = c("Ounces","Style"),is.na(Ounces) == FALSE)
minDF1$S1 = NA
for(k in 1:nrow(minDF1))
{
  if(grepl("IPA", minDF1$Style[k], fixed = TRUE) == TRUE)
  {
    minDF1$S1[k] = "IPA"
  }
  else if(grepl("Ale", minDF1$Style[k], fixed = TRUE) == TRUE)
  {
    minDF1$S1[k] = "Ale"
  }
}
aggregate(Ounces ~ S1, data = minDF1, FUN = sum)
##    S1 Ounces
## 1 Ale 1600.0
## 2 IPA  770.4

Test Normality of Ounces

shapiro.test(minDF1$Ounces)
## 
##  Shapiro-Wilk normality test
## 
## data:  minDF1$Ounces
## W = 0.6026, p-value < 2.2e-16
qqnorm(minDF1$Ounces)

Evaluation of the Ounces between two styles(IPA and Ale) in different cities in Colorado

wilcox.test(minDF1$Ounces ~ minDF1$S1)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  minDF1$Ounces by minDF1$S1
## W = 3196, p-value = 0.3247
## alternative hypothesis: true location shift is not equal to 0

The difference between Ounces and styles wasnot significant